In [ ]:
from __future__ import print_function
import sklearn
# Report the installed scikit-learn release so results can be tied to a version
sklearn_version = sklearn.__version__
print('Sklearn version:', sklearn_version)
In [ ]:
from sklearn import datasets
# Load the California housing dataset through the public loader exported by
# sklearn.datasets. The `datasets.california_housing` submodule used previously
# is private in current scikit-learn releases, so reaching through it fails
# there; the public function is available in old and new releases alike.
all_data = datasets.fetch_california_housing()
# Describe the dataset: free-text description, then the feature column names
print(all_data.DESCR)
print(all_data.feature_names)
In [ ]:
# Peek at the raw arrays: the first ten feature rows, then the full target vector
first_rows = all_data.data[:10]
print(first_rows)
targets = all_data.target
print(targets)
In [ ]:
# Randomize the sample order, separate train & test sets, then normalize.
# The `...` lines below are exercise placeholders left for the reader to complete.
from sklearn.utils import shuffle
# Fixed random_state so the shuffled order is reproducible between runs
X, y = shuffle(all_data.data, all_data.target, random_state=42)
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples for testing; fixed random_state for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Normalize the data
# NOTE(review): per the scikit-learn docs, Normalizer rescales each sample (row)
# to unit norm rather than scaling each feature — confirm this is what the
# exercise intends.
from sklearn.preprocessing import Normalizer
# Exercise: define the normalizer
...
# Exercise: fit the normalizer on the train data and transform it
...
# Exercise: transform the test data with the normalizer fitted above
...
In [ ]:
In [ ]:
from sklearn import linear_model
# Exercise: select the correct linear model and fit it
# (complete the estimator name after `linear_model.`; the line is not valid Python until then)
reg = linear_model. ...
reg.fit(X_train, y_train)
# Evaluate the fitted model on the held-out test data
from sklearn.metrics import mean_absolute_error
y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
# NOTE(review): scikit-learn documents regressor score() as R^2 — confirm the
# 'Variance score' label below is the intended wording.
print('Variance score: ', reg.score(X_test, y_test))
In [ ]:
# Plot a scatter plot of real vs. predicted target values
import matplotlib.pyplot as plt
# IPython magic, not plain Python: this line only works inside a notebook/IPython session
%matplotlib inline
# Exercise: draw the scatter plot of real vs. predicted values
# (later cells in this notebook do it with plt.scatter(y_test, y_test_predict))
...
In [ ]:
# Save model
# joblib is distributed as a standalone package. The copy vendored as
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone import and fall back to the vendored one only
# on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted estimator `reg` to disk so a later cell can reload it
joblib.dump(reg, '/tmp/reg_model.pkl')
In [ ]:
# Load model: read back the estimator that the save cell wrote to this path
# NOTE(review): joblib.load is pickle-based, so loading a file can execute
# arbitrary code, and /tmp is usually writable by other users — only load a
# file this notebook has just written, or move it to a private directory.
reg_loaded = joblib.load('/tmp/reg_model.pkl')
In [ ]:
# Inspect the parameters of the reloaded linear model: weights first, then bias
weights = reg_loaded.coef_
print('Coeficients :', weights)
bias = reg_loaded.intercept_
print('Intercept: ', bias)
In [ ]:
In [ ]:
# Use RidgeCV to select the best alpha using cross-validation
# Exercise: define the RidgeCV model as `reg`, testing alpha over the values 0.1, 1 and 10
# (until the placeholder is filled in, `reg` is still whatever an earlier cell assigned)
...
reg.fit(X_train, y_train)
# Report the alpha stored on the fitted model
print('Best alpha: ', reg.alpha_)
In [ ]:
# Build a Ridge model with the recommended alpha
# Exercise: replace the `...` with the alpha value reported by the RidgeCV cell
reg = linear_model.Ridge (alpha = ...)
reg.fit(X_train, y_train)
# Evaluate on the held-out test data
y_test_predict = reg.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg.score(X_test, y_test))
# Scatter plot of real (x axis) vs. predicted (y axis) target values
plt.scatter(y_test, y_test_predict)
In [ ]:
In [ ]:
from sklearn import svm
# Exercise: select the correct model from `svm` and define it as `reg_svr`
reg_svr = ...
reg_svr.fit(X_train, y_train)
# Evaluate on the held-out test data
y_test_predict = reg_svr.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', reg_svr.score(X_test, y_test))
# Scatter plot of real (x axis) vs. predicted (y axis) target values
plt.scatter(y_test, y_test_predict)
In [ ]:
In [ ]:
# Exercise: import the scikit-learn module that provides the regression tree
# (the tree-plotting cell later refers to it as `tree`; the line is not valid Python until completed)
from sklearn import ...
# Exercise: define the tree and name it `dtree`, as the lines below expect
...
dtree.fit(X_train, y_train)
# Evaluate on the held-out test data
y_test_predict = dtree.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree.score(X_test, y_test))
# Scatter plot of real (x axis) vs. predicted (y axis) target values
plt.scatter(y_test, y_test_predict)
In [ ]:
# A second model, regularized by controlling the depth
# Exercise: build a second tree named `dtree2` with a max depth of 5, then fit it
# (two placeholder lines: one for the definition, one for the fit)
...
...
# Evaluate on the held-out test data
y_test_predict = dtree2.predict(X_test)
print('Mean absolute error ', mean_absolute_error(y_test, y_test_predict))
print('Variance score: ', dtree2.score(X_test, y_test))
# Scatter plot of real (x axis) vs. predicted (y axis) target values
plt.scatter(y_test, y_test_predict)
In [ ]:
# Render the depth-limited tree (dtree2) as an inline PNG image
import pydotplus
from IPython.display import Image
# Rendering options gathered in one place; out_file=None makes export_graphviz
# hand back the DOT source instead of writing it to a file.
export_options = dict(
    out_file=None,
    feature_names=all_data.feature_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_source = tree.export_graphviz(dtree2, **export_options)
tree_graph = pydotplus.graph_from_dot_data(dot_source)
# Keep the Image as the cell's last expression so the notebook displays it
Image(tree_graph.create_png())
In [ ]: